##### ########################################################## ######
#####                                                            ######
#####   Input: Debates                                           ######
#####   Output: 1) Repetition scores at the MP-debate level
#####   Output: 2) Repetition scores at the sentence level for validation
#####
##### ########################################################## ######

# Remove every non-hidden object that ls() reports in the current environment,
# so the script starts from a clean workspace. Loaded packages are unaffected.
rm(list = ls())

library(data.table) # CRAN v.1.13.6
library(plyr) # CRAN v.1.8.6

# Switches for the two stages of this script; set one to FALSE to skip it.
# Spelled as TRUE (a reserved word) rather than T, which can be reassigned.
calculate_ind_debate <- TRUE # MP-debate level scores; about 18 mins
calculate_sentence <- TRUE ## sentence level scores; about 7 hours

zipit <- function(raw_texts){
  
  # Compute the gzip compression ratio of each text in `raw_texts`.
  #
  # Every text is written to its own file, compressed with the system `gzip`
  # tool, and the "ratio" column printed by `gzip -l` is read back. Texts are
  # handled in segments of up to 1000 files at a time.
  #
  # Args:
  #   raw_texts: vector of texts; each element is written to disk with cat().
  #
  # Returns:
  #   Numeric vector with one entry per element of `raw_texts`, in the same
  #   order, holding the ratio reported by gzip as a proportion (percent / 100).
  #   An entry is NA when no ratio could be read back for that text.
  #   Empty input gives numeric(0).
  
  n_raw <- length(raw_texts)
  if(n_raw == 0){
    return(numeric(0))
  }
  if(!nzchar(Sys.which("gzip"))){
    stop("zipit() needs the 'gzip' command-line tool on the PATH", call. = FALSE)
  }
  
  # Work in a private scratch directory instead of the session temp directory
  # itself, so listing and clean-up never touch unrelated temporary files.
  tmp_dir <- tempfile(pattern = "zipit_", tmpdir = tempdir(check = TRUE))
  if(!dir.create(tmp_dir)){
    stop("zipit() could not create scratch directory ", tmp_dir, call. = FALSE)
  }
  on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE)
  quoted_dir <- shQuote(tmp_dir)
  
  cuts <- unique(c(seq(1,n_raw,1000),n_raw+1))
  n_segments <- length(cuts)-1
  ratio_out <- rep(NA_real_, n_raw)
  
  for(seg in seq_len(n_segments)){
    seg_idx <- cuts[seg]:(cuts[seg+1]-1)
    print(paste0("Compressing texts ",cuts[seg], " to ", cuts[seg+1]-1," (total = ",n_raw,")"))
    text <- raw_texts[seg_idx]
    
    ## Write each speech to its own file, named by its position in the segment
    for(i in seq_along(text)){
      cat(text[i], file = file.path(tmp_dir, paste0(i, "_tmp.txt")))
    }
    
    ## Compress every file in the scratch directory, then list sizes/ratios once
    system(paste("gzip -r", quoted_dir))
    listing <- trimws(system(paste("gzip -r -l", quoted_dir), intern = TRUE))
    
    ## Keep only per-file rows; this drops the header line and the "(totals)"
    ## line that gzip appends when more than one file is listed
    listing <- listing[grepl("/[0-9]+_tmp\\.txt$", listing)]
    file_id <- as.integer(sub("^.*/([0-9]+)_tmp\\.txt$", "\\1", listing))
    ratio_chr <- vapply(strsplit(listing, "[[:space:]]+"),
                        function(fields) fields[3], character(1))
    
    ## Place each ratio at the position given by its file name, so the result
    ## does not depend on the order in which gzip lists the files
    ratio <- rep(NA_real_, length(text))
    ratio[file_id] <- as.numeric(gsub("%","",ratio_chr))/100
    ratio_out[seg_idx] <- ratio
    
    ## Empty the scratch directory before the next segment reuses the names
    unlink(list.files(tmp_dir, full.names = TRUE))
  }
  
  return(ratio_out)
}



# Strip leading and trailing whitespace from every element of `s`.
trim <- function(s){
  gsub(pattern = "^[[:space:]]+|[[:space:]]+$", replacement = "", x = s)
}
# Standardize `x` to z-scores: subtract the mean and divide by the standard
# deviation, both computed with NAs removed. NA elements of `x` stay NA.
# Uses the literal TRUE rather than T, which is a reassignable variable.
stdize <- function(x) (x - mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE)

if (calculate_ind_debate) {

  load("data/debates.Rdata")

  ## Repetition test
  ## Collapse `debates` to one row per person_id x section_id: all `body`
  ## texts pasted together with a space, and `n_words` summed.
  tmp <- debates[,
                 list(
                   body = paste0(body, collapse = " "),
                   n_words = sum(n_words)
                 ),
                 by = list(person_id, section_id)]

  ## Compression ratio of each collapsed text, timing the whole run
  time_taken <- system.time(ratio <- zipit(tmp$body))

  tmp$ratio <- ratio

  repetition_scores <- tmp[, c("person_id", "section_id", "n_words", "ratio")]

  ## `repetition` is the ratio itself, with any Inf value set to NA
  repetition_scores$repetition <- replace(ratio, ratio == Inf, NA)

  save(repetition_scores, file = "working/repetition_mp_debate.Rdata")
  print(time_taken)
}

## Calculate sentence level repetition scores

if (calculate_sentence) {

  # Load data

  load("working/dictionaries_sentence.Rdata")

  # Compression ratio of every text in the `sent` column, timing the whole run
  time_taken <- system.time(ratio <- zipit(dictionary_scores$sent))

  # Identifier and word count per row, alongside the raw ratio
  score_frame <- data.frame(
    epobject_id = dictionary_scores$epobject_id,
    n_words = dictionary_scores$n_words,
    ratio = ratio
  )
  repetition_scores <- data.table(score_frame)
  rm(score_frame)

  # Negative ratios are set to 0 before standardizing
  repetition_scores$repetition <- ifelse(ratio < 0, 0, ratio)

  repetition_scores$repetition_std <- stdize(repetition_scores$repetition)

  save(repetition_scores, file = "working/repetition_sentence.Rdata")

  print(time_taken)

}


